library(ggplot2)
library(tidyverse)
library(lubridate)
setwd("/Volumes/Courses/ES218_A_SP/wokopa26/Vehicular accidents")
Accidents_Original <- readRDS("farsp.RDS")

alcohol <- read.csv("alcohol.csv")
alcohol <- alcohol %>% 
  mutate(description = if_else(description %in% c("Unknown", "Not reported"), NA, description))

body_typ <- read.csv("body_typ.csv")
body_typ <- body_typ %>% 
  mutate(description = if_else(description %in% c("Unknown body type", "Not Reported"), NA, description))

col_names <- read.csv("column_name_description.csv")

inj_sev <- read.csv("inj_sev.csv")
inj_sev <- inj_sev %>% 
  mutate(description = if_else(description %in% c("Unknown/Not Reported", "N/A"), NA, description))

man_coll <- read.csv("man_coll.csv")
man_coll <- man_coll %>% 
  mutate(description = if_else(description %in% c("Not Reported", "Unknown"), NA, description))

sex <- read.csv("sex.csv")
sex <- sex %>% 
  mutate(description = if_else(description %in% c("not reported", "unknown"), NA, description))

state_code <- read.csv("state_code.csv")
col_names <- setNames(col_names$column_name, col_names$description)
names(col_names) <- gsub(" ", "_", names(col_names))

Names_Changed <- Accidents_Original %>%
  rename(alcohol = drinking) %>% 
  left_join(state_code, by = c("state" = "state_code")) %>%
  left_join(sex, by = "sex") %>%
  left_join(man_coll, by = "man_coll") %>%
  left_join(inj_sev, by = "inj_sev") %>%
  left_join(body_typ, by = "body_typ") %>%
  left_join(alcohol, by = "alcohol") %>%
    mutate(across(c(year, month, day, hour, minute, age), ~replace(., . == 99, NA)),
         across(c(age), ~replace(., . == 999, NA))) %>%
  drop_na(c("year", "month", "day", "hour", "minute")) %>% 
  mutate(
    state = as.character(state_name),
    sex = as.character(description.x),
    man_coll = as.character(description.y),
    alcohol = as.character(description),
    inj_sev = as.character(description.y),
    body_typ = as.character(description.y),
    Datetime = ymd_hm(paste(year, month, day, hour, minute)),
    Month_Name = month(month, label = TRUE)
  ) %>%
  rename(!!!col_names) %>% 
  mutate(
    State = as.factor(State),
    County = as.factor(County),
    Manner_of_collision = as.factor(Manner_of_collision),
    Vehicle_body_type = as.factor(Vehicle_body_type),
    Sex = as.factor(Sex),
    Injury_severity = as.factor(Injury_severity),
    Datetime = as.POSIXct(Datetime),
    Date = as.Date(Datetime),
    Accident_weekday = wday(Datetime, label = TRUE),
    Month_Name = month(Datetime, label = TRUE), # create month name column
    Year = year(Datetime)
    ) %>%
  filter(Age < 100) %>%
  select(-State, -description.x, -description.y, -description.x.x, -description.y.y,-description)
# 1. Number of accidents per state
ggplot(Names_Changed %>% filter(!is.na(state_name)), aes(x = state_name)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Number of Accidents per State")

# 2. Number of accidents per month
ggplot(Names_Changed %>% filter(!is.na(Datetime)), aes(x = month(Datetime, label = TRUE))) +
  geom_bar() +
  ggtitle("Number of Accidents per Month")

# 3. Time series plot
ggplot(Names_Changed, aes(x = Date, y = after_stat(count))) +
  geom_line(stat = "count") +
  labs(x = "Date", y = "Accidents", title = "Accident Rates Over Time")

# 4. Number of accidents per hour
ggplot(Names_Changed %>% filter(!is.na(Accident_hour)), aes(x = Accident_hour)) +
  geom_bar() +
  ggtitle("Number of Accidents per Hour")

# 5. Distribution of the age of persons involved in accidents
ggplot(Names_Changed %>% filter(!is.na(Age)), aes(x = Age)) +
  geom_histogram(binwidth = 1) +
  ggtitle("Distribution of Age of Persons Involved in Accidents")

# 6. Number of accidents by sex
ggplot(Names_Changed %>% filter(!is.na(Sex)), aes(x = Sex)) +
  geom_bar() +
  ggtitle("Number of Accidents by Sex")

# 7. Number of accidents with alcohol involvement
ggplot(Names_Changed %>% filter(!is.na(Alcohol_involvement)), aes(x = Alcohol_involvement)) +
  geom_bar() +
  ggtitle("Number of Accidents with Alcohol Involvement")

# 8. Distribution of injury severity - Duplicate?
ggplot(Names_Changed %>% filter(!is.na(Injury_severity)), aes(x = Injury_severity)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Distribution of Injury Severity")

# 9. Number of accidents by manner of collision - Duplicate?
ggplot(Names_Changed %>% filter(!is.na(Manner_of_collision)), aes(x = Manner_of_collision)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Number of Accidents by Manner of Collision")

# 10. Number of accidents by vehicle body type
ggplot(Names_Changed %>% filter(!is.na(Vehicle_body_type)), aes(x = Vehicle_body_type)) +
  geom_bar() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ggtitle("Number of Accidents by Vehicle Body Type")

# 11. Histogram of age
ggplot(Names_Changed, aes(x = Age)) +
  geom_histogram(binwidth = 1) +
  labs(x = "Age", y = "Count", title = "Age Distribution of Persons Involved in Accidents")

# 12. Box plot of age by injury severity
ggplot(Names_Changed, aes(x = Injury_severity, y = Age)) +
  geom_boxplot() +
  labs(x = "Injury Severity", y = "Age", title = "Age Distribution by Injury Severity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# 13. Bar plot of manner of collision
ggplot(Names_Changed, aes(x = Manner_of_collision)) +
  geom_bar() +
  labs(x = "Manner of Collision", y = "Count", title = "Manner of Collision Distribution") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# 14. Plot number of accidents by hour of the day
accidents_by_hour_month <- Names_Changed %>%
  group_by(Accident_hour, Month_Name) %>%
  summarize(Count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Accident_hour'. You can override using the
## `.groups` argument.
# 15. Plot the stacked bar plot of accidents by hour of the day, colored by month
ggplot(accidents_by_hour_month, aes(x = Accident_hour, y = Count, fill = factor(Month_Name))) +
  geom_col(position = "stack") +
  labs(title = "Number of Accidents by Hour of the Day and Month") +
  scale_fill_discrete(name = "Hour of the Day") +
  theme_minimal()

# 16. Scatter plot showing the relationship between age and the number of persons involved in accidents.
ggplot(Names_Changed, aes(x = Age, y = Number_of_persons_involved)) + 
  geom_point(alpha = 0.3) +
  labs(x = "Age", y = "Number of persons involved")

# 17. Heatmap showing the density of accidents based on the day of the month and the time of day. - Does this show that something is at like 8pm that is odd. There are a few times that seem to be defaults.
ggplot(Names_Changed, aes(x = Accident_hour, y = Accident_day)) + 
  geom_bin2d(bins = 20) +
  labs(x = "Hour of day", y = "Day of month", fill = "Number of accidents")

# 18. Bar plot showing the total number of accidents on each day of the week.
ggplot(Names_Changed, aes(x = Accident_weekday)) + 
  geom_bar() +
  labs(x = "Day of week", y = "Number of accidents")

# 19. Bar plot showing the total number of accidents in each year.
ggplot(Names_Changed, aes(x = Accident_year)) + 
  geom_bar() +
  labs(x = "Year", y = "Number of accidents")

# 20. Scatter plot showing the relationship between age and the time of day when accidents occur.
ggplot(Names_Changed, aes(x = Accident_hour, y = Age)) + 
  geom_point(alpha = 0.0005) +
  labs(x = "Hour of day", y = "Age")

# 21. Stacked bar plot showing the number of accidents for each vehicle body type, colored by the injury severity.
ggplot(Names_Changed, aes(x = Vehicle_body_type, fill = Injury_severity)) + 
  geom_bar(position = "stack") +
  labs(x = "Vehicle body type", y = "Number of accidents")

# 22. Box plot showing the distribution of age for each sex involved in accidents.
ggplot(Names_Changed, aes(x = Sex, y = Age)) + 
  geom_boxplot() +
  labs(x = "Sex", y = "Age")

# 23. Stacked bar plot showing the number of accidents for each month, colored by the injury severity.
ggplot(Names_Changed, aes(x = Month_Name, fill = Injury_severity)) + 
  geom_bar() +
  labs(x = "Month", y = "Number of accidents")+   theme(axis.text.x = element_text(angle = 90, hjust = 1))

# 24. Density plot showing the distribution of the number of persons involved in accidents.
ggplot(subset(Names_Changed, Number_of_persons_involved < 10), aes(x = Number_of_persons_involved)) + 
  geom_density() +
  labs(x = "Number of persons involved", y = "Density")

# 25. Box plot showing the distribution of age for each manner of collision.
ggplot(Names_Changed, aes(x = Manner_of_collision, y = Age)) + 
  geom_boxplot()

# 26. Heatmap of accidents by hour and day of the week.
accidents_by_hour_weekday <- Names_Changed %>%
  group_by(hour = hour(Datetime), weekday = wday(Datetime, label = TRUE)) %>%
  summarize(n = n())
## `summarise()` has grouped output by 'hour'. You can override using the
## `.groups` argument.
ggplot(accidents_by_hour_weekday, aes(x = hour, y = weekday, fill = n)) +
  geom_tile() +
  scale_fill_gradient(low = "#f7f7f7", high = "#e31a1c") +
  labs(x = "Hour of day", y = "Day of week", fill = "Number of accidents")

# 27. Faceted bar plot of accidents by vehicle body type and injury severity, colored by sex.

accidents_by_body_type_injury_sex <- Names_Changed %>%
  filter(!is.na(Vehicle_body_type), !is.na(Injury_severity)) %>%
  group_by(Vehicle_body_type, Injury_severity, Sex) %>%
  summarize(n = n())
## `summarise()` has grouped output by 'Vehicle_body_type', 'Injury_severity'. You
## can override using the `.groups` argument.
ggplot(accidents_by_body_type_injury_sex, aes(x = Vehicle_body_type, y = n, fill = Injury_severity))+
  geom_col(position = "dodge") +
  facet_wrap(~ Sex, scales = "free_y") +
  labs(x = "Vehicle body type", y = "Number of accidents", fill = "Injury severity")

# 28. Relationship between Hour of Day and Age of Persons Involved in Accidents, Colored by Injury Severity and Faceted by Sex.

ggplot(Names_Changed, aes(x = Accident_hour, y = Age, color = Injury_severity)) +
    geom_point(alpha = 0.01) +
    facet_wrap(~ Sex) +
    labs(x = "Hour of day", y = "Age", color = "Injury severity")+
    guides(color = guide_legend(override.aes = list(alpha = 1)))

accidents_per_day <- Names_Changed %>%
  mutate(DoY = yday(Datetime), Month = month(Datetime), Day = day(Datetime), Hour = hour(Datetime)) %>%
  group_by(Month, Day, Hour) %>%
  summarize(count = n()) %>%
  ungroup()
## `summarise()` has grouped output by 'Month', 'Day'. You can override using the
## `.groups` argument.
ggplot(accidents_per_day, aes(x = Day, y = Month, fill = count)) +
  geom_tile(color = "white") +
  scale_y_continuous(trans = 'reverse', breaks = 1:12, labels = month.abb) +
  labs(x = "Day", y = "Month", fill = "Accidents", title = "Accidents per Day of the Year (Aggregating All Years)") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

##

ggplot(accidents_per_day, aes(x = Day, y = Hour, fill = count - mean(count))) +
  geom_tile(width = 0.5) +
  scale_y_reverse() +
  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0) +
  facet_wrap(~ month(Month, label = TRUE)) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0, hjust = 1)) +
  labs(x = "Day", y = "Time", fill = "Accidents", title = "New Year\'s Eve and nighttime is dangerous!")

# Bivariate relationships - Scatter plot:
ggplot(data = Names_Changed, aes(x = Age, y = Number_of_persons_involved)) + 
  geom_point(alpha = 0.05) + 
  theme_minimal()

# Time-based analysis - Seasonal decomposition plot:

monthly_accidents <- Names_Changed %>%
  mutate(Month = floor_date(Datetime, "month")) %>%
  count(Month)

ggplot(monthly_accidents, aes(x = Month, y = n)) +
  geom_line() +
  theme_minimal() +
  labs(x = "Year", y = "Number of Accidents")

Next split into states or reigions like New England where hours of daylight will change and may cause more accidents

Maybe use MWStats for time series analysis